## # A tibble: 160,318,860 × 7
## FIPSid year FIPS_Code_State fips Enrollment item amount
## <chr> <int> <chr> <chr> <int> <chr> <int>
## 1 00015003250371 2021 01 01003 2221 SchLevCode 3
## 2 00015003250371 2021 01 01003 2221 Total_Revenue 39506
## 3 00015003250371 2021 01 01003 2221 Total_Rev_Own_S… 7576
## 4 00015003250371 2021 01 01003 2221 General_Revenue 39506
## 5 00015003250371 2021 01 01003 2221 Gen_Rev_Own_Sou… 7576
## 6 00015003250371 2021 01 01003 2221 Total_Taxes 5610
## 7 00015003250371 2021 01 01003 2221 Property_Tax 5609
## 8 00015003250371 2021 01 01003 2221 Tot_Sales___Gr_… 0
## 9 00015003250371 2021 01 01003 2221 Total_Gen_Sales… 0
## 10 00015003250371 2021 01 01003 2221 Total_Select_Sa… 0
## # ℹ 160,318,850 more rows
## # A tibble: 616,015 × 7
## FIPSid year FIPS_Code_State fips SchLevCode Enrollment Total_Revenue
## <chr> <int> <chr> <chr> <int> <int> <int>
## 1 000150011778… 1970 01 01001 NA 6800 3226
## 2 000150011778… 1971 01 01001 NA 6800 3548
## 3 000150011778… 1972 01 01001 NA 7163 3579
## 4 000150011778… 1973 01 01001 NA 6800 3758
## 5 000150011778… 1974 01 01001 NA 7163 3642
## 6 000150011778… 1975 01 01001 NA 7163 5312
## 7 000150011778… 1976 01 01001 NA 7163 6111
## 8 000150011778… 1977 01 01001 NA 7179 6382
## 9 000150011778… 1978 01 01001 NA 7163 7265
## 10 000150011778… 1979 01 01001 NA 7271 8648
## # ℹ 616,005 more rows
~ 10% missing values for the Total revenue category
## [1] 0.8947129
## # A tibble: 58 × 265
## FIPSid year FIPS_Code_State fips SchLevCode Enrollment Total_Revenue
## <chr> <int> <chr> <chr> <int> <int> <int>
## 1 21 2002 04 04015 2 2234 0
## 2 252 1997 06 06023 1 2937 0
## 3 000650371845… 1997 06 06037 4 2445 0
## 4 000650591236… 1997 06 06059 4 20014 0
## 5 000650591236… 1999 06 06059 4 9549 0
## 6 000650612014… 1997 06 06061 4 6909 0
## 7 000650612014… 2002 06 06061 4 6909 0
## 8 364 2002 06 06085 1 1456 0
## 9 000650851236… 1997 06 06085 4 9728 0
## 10 367 1997 06 06087 2 5682 0
## # ℹ 48 more rows
## # ℹ 258 more variables: Total_Rev_Own_Sources <int>, General_Revenue <int>,
## # Gen_Rev_Own_Sources <int>, Total_Taxes <int>, Property_Tax <int>,
## # Tot_Sales___Gr_Rec_Tax <int>, Total_Gen_Sales_Tax <int>,
## # Total_Select_Sales_Tax <int>, Public_Utility_Tax <int>, Tobacco_Tax <int>,
## # Other_Select_Sales_Tax <int>, Total_Income_Taxes <int>,
## # Individual_Income_Tax <int>, Taxes_NEC <int>, Total_IG_Revenue <int>, …
temp_cleaned %>%
filter(Total_Revenue != 0) %>%
group_by(FIPSid) %>%
summarise(years = n()) %>%
ggplot() +
geom_histogram(aes(x = years), binwidth = 3) +
ggtitle("Histogram of reported years over 31-year period between 1970-2021")
temp_cleaned %>%
filter(Total_Revenue != 0) %>%
group_by(year) %>%
summarise(reporting = n()) %>%
ggplot() +
geom_bar(aes(x = year, weight = reporting)) +
ggtitle("Reporting over 31-year period between 1970-2021")
temp_short %>%
group_by(FIPS_Code_State) %>%
summarise(n_dist = n_distinct(FIPSid)) %>%
ggplot() +
geom_col(aes(x = fct_rev(fct_reorder(as.factor(FIPS_Code_State), n_dist)), y = n_dist)) +
labs(title = "Number of schools per state")
temp_short %>%
group_by(FIPS_Code_State) %>%
summarise(n_dist = n_distinct(FIPSid)) %>%
ggplot() +
geom_histogram(aes(n_dist), bins = 10) +
labs(title = "Histogram of number of school districts per state")
for(k in unique(temp_short$FIPS_Code_State)){
print(k)
print(get_state(k))
temp_short %>%
filter(FIPS_Code_State == k) %>%
panelview(Total_Revenue != 0 ~ 1, index = c("FIPSid","year"), xlab = "Year", ylab = "School District", display.all = TRUE, type = "missing")
}
## [1] "01"
## [1] "AL"
## [1] "04"
## [1] "AZ"
## [1] "05"
## [1] "AR"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "06"
## [1] "CA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "08"
## [1] "CO"
## [1] "09"
## [1] "CT"
## [1] "12"
## [1] "FL"
## [1] "17"
## [1] "IL"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "18"
## [1] "IN"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "20"
## [1] "KS"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "23"
## [1] "ME"
## [1] "25"
## [1] "MA"
## [1] "26"
## [1] "MI"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "27"
## [1] "MN"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "28"
## [1] "MS"
## [1] "29"
## [1] "MO"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "30"
## [1] "MT"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "31"
## [1] "NE"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "34"
## [1] "NJ"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "36"
## [1] "NY"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "38"
## [1] "ND"
## [1] "39"
## [1] "OH"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "40"
## [1] "OK"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "42"
## [1] "PA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "45"
## [1] "SC"
## [1] "46"
## [1] "SD"
## [1] "48"
## [1] "TX"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "50"
## [1] "VT"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "54"
## [1] "WV"
## [1] "55"
## [1] "WI"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "10"
## [1] "DE"
## [1] "13"
## [1] "GA"
## [1] "16"
## [1] "ID"
## [1] "19"
## [1] "IA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "21"
## [1] "KY"
## [1] "22"
## [1] "LA"
## [1] "32"
## [1] "NV"
## [1] "33"
## [1] "NH"
## [1] "35"
## [1] "NM"
## [1] "41"
## [1] "OR"
## [1] "44"
## [1] "RI"
## [1] "47"
## [1] "TN"
## [1] "49"
## [1] "UT"
## [1] "51"
## [1] "VA"
## Time is not evenly distributed (possibly due to missing data).
## [1] "53"
## [1] "WA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "56"
## [1] "WY"
av_states <- unique(temp_short$FIPS_Code_State)
sum_state <- function(df, loc){
print(get_state(av_states[loc]))
test <- temp_short %>%
filter(FIPS_Code_State == av_states[loc]) %>%
group_by(FIPSid) %>%
summarise(n = n()) %>%
filter(n != 25) %>%
nrow(.)
if(test == 0){
return(paste0(get_state(av_states[loc]), " is complete!"))
}
tmp <- df %>%
filter(FIPS_Code_State == av_states[loc])
pt_all <- tmp %>% panelview(Total_Revenue != 0 ~ 1, index = c("FIPSid","year"), xlab = "Year", ylab = "School District", display.all = TRUE, type = "missing", main = get_state(av_states[loc]))
pt_missing <- tmp %>%
group_by(FIPSid) %>%
filter(n() != 25) %>%
panelview(Total_Revenue != 0 ~ 1, index = c("FIPSid","year"), xlab = "Year", ylab = "School District", display.all = TRUE, type = "missing", main = get_state(av_states[loc]))
grid.arrange(pt_all, pt_missing)
print(paste0("Unique school district codes: ", n_distinct(tmp$FIPSid)))
print(paste0("Incomplete school districts: ", test))
return()
}
print_names <- function(state_df){
test <- temp %>%
group_by(FIPSid) %>%
filter(n() != 25) %>% pull(FIPSid) %>% unique
sch_names %>%
filter(FIPSid %in% test) %>%
select(FIPSid, fips, Name, year) %>%
arrange(year) %>%
pivot_wider(id_cols = c(FIPSid, fips), names_from = "year", values_from = "Name") %>%
arrange(fips) %>% print(n = 300)
}
Creates excel in which notes about choices on how to handle missing data Only Delaware, Nevada, and Tennessee have complete panels for all school districts 1997-2021
missing_dat <- tibble("state_code" = character(),
"state" = character(),
"n_incomplete" = numeric(),
"n_total" = numeric(),
"FIPSid" = character(),
"y_missing" = numeric(),
"y_earliest" = numeric(),
"y_latest" = numeric(),
"notes" = character(),
"action" = character())
for(k in av_states){
tmp <- temp_short %>%
filter(FIPS_Code_State == k)
missing_dat <- tmp %>%
group_by(FIPSid) %>%
filter(n() != 25) %>%
summarise(y_missing = 25 - n(),
y_earliest = min(year),
y_latest = max(year),
notes = NA,
action = NA) %>%
mutate(state_code = k,
state = get_state(k),
n_incomplete = n(),
n_total = n_distinct(tmp$FIPSid), .before = 1) %>%
rbind(missing_dat, .)
}
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `y_earliest = min(year)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `y_earliest = min(year)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `y_earliest = min(year)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
#missing_dat %>% write.xlsx(here('documentation/missing_data_cleaning.xlsx'))
Duplicates remain….down from 19,026 duplicate entries to 4,955
sch_dist_names <- readRDS(here("data/temp/names_school_level_info.RDS")) %>%
filter(year >=1997) %>%
mutate(Name = case_when(FIPSid =="00185067116455" ~ "KOKOMO CENTER TOWNSHIP CONSOLIDATED SCHOOL CORPORATION",
TRUE ~ Name))
temp_short %>% pull(FIPSid) %>% n_distinct
## [1] 15116
sch_names <- sch_dist_names %>%
# First handle abbreviations
mutate(Name =
gsub("DIST$", "DISTRICT",
gsub("DIST ", "DISTRICT ",
gsub("DIS$", "DISTRICT",
gsub(" DST", " DISTRICT",
gsub("^FT", "FORT",
gsub(" D$", " DISTRICT",
gsub(" DI$", " DISTRICT",
gsub(" DT$", " DISTRICT",
gsub(" DT ", "DISTRICT",
gsub("DISTR |DISTR$", "DISTRICT",
# Handle mispellings
gsub("DISRICT", "DISTRICT",
gsub("DISTT", "DISTRICT",
gsub("DISTRIC$", "DISTRICT",
gsub(" SCOOL$", " SCHOOL",
gsub(" SCOOL ", " SCHOOL ",
gsub(" SCOOLS", " SCHOOLS",
gsub(" Num", " ",
gsub("#", "",
gsub(" ", " ",
str_trim(gsub("[.]", " ", Name), side = "right"))))))))))))))))))))) %>%
mutate(Name =
gsub(" JT ", " JOINT ",
gsub("ELEMENTARY SCHOOL DISTRICT", "SCHOOL DISTRICT",
gsub(" SCHOOLS$", " SCHOOL DISTRICT",
gsub(" SCH$", " SCHOOL DISTRICT",
gsub(" ED ", " EDUCATION ",
# Cooperative
gsub("COOP ", "COOPERATIVE ",
gsub(" COOP$", " COOPERATIVE",
gsub("CO-OP ", "COOPERATIVE ",
gsub("CO-OP$", "COOPERATIVE",
# Bi-county
gsub("BI-CO ", "BI-COUNTY ",
# Elementary, High
gsub("ELEM ", "ELEMENTARY ",
# Unified
gsub("UNIF ", "UNIFIED ",
# Consolidated Independent
gsub(" COMM CONSOLIDATED ", " COMMUNITY CONSOLIDATED ",
gsub(" CONS ", " CONSOLIDATED ",
gsub(" CONSOL ", "CONSOLIDATED ",
gsub(" CONSOLID ", "CONSOLIDATED ",
# Community Corporations
gsub(" CORP$", " CORPORATION",
gsub(" CORP ", " CORPORATION ",
gsub(" COLL ", " COLLEGE ",
gsub("CMTY", "COMMUNITY",
gsub("COMMNTY", "COMMUNITY",
gsub("CMNTY", "COMMUNITY",
gsub(" COMM ", " COMMUNITY ",
# Boards of Education
gsub("BOARD PUBLIC INS", "SCHOOL DISTRICT",
gsub("SCHOOL BOARD", "SCHOOL DISTRICT",
gsub("BOARD OF EDUC", "SCHOOL DISTRICT",
gsub("BOARD OF EDUCATION", "SCHOOL DISTRICT",
gsub(" BD", " BOARD",
# School Districts
gsub(" SCHS", " SCHOOLS",
gsub(" PUB SCHS", " SCHOOLS",
gsub(" SCHOOLS DISTRICT", " SCHOOL DISTRICT",
gsub(" SCHS DISTRICT", " SCHOOL DISTRICT",
gsub(" S$", " SCHOOL DISTRICT",
gsub(" CO ", " COUNTY ",
gsub(" CTY ", " CITY ",
gsub(" CY ", " CITY ",
gsub(" SCH ", " SCHOOL ",
gsub(" SCH SCH ", " SCH ",
Name))))))))))))))))))))))))))))))))))))))) %>%
# Mispelled words
mutate(Name = gsub("MOUNTIAN", "MOUNTAIN",
gsub("SCHOO ", "SCHOOL ",
gsub("LAMOTTE", "LA MOTTE",
gsub("SANDSPRINGS", "SAND SPRINGS", Name))))) %>%
# Remove numerical values
mutate(Name = str_trim(gsub("[[:digit:]]", "", Name), side = "right")) #%>%
# select(FIPSid, Name) %>%
# distinct %>%
# group_by(FIPSid) %>%
# filter(n() > 1) %>%
# arrange(FIPSid) %>% View()
for(k in 1:length(av_states)){
sum_state(temp_short, k)
}
## [1] "AL"
## [1] "Unique school district codes: 138"
## [1] "Incomplete school districts: 11"
## [1] "AZ"
## [1] "Unique school district codes: 250"
## [1] "Incomplete school districts: 29"
## [1] "AR"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 328"
## [1] "Incomplete school districts: 95"
## [1] "CA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 1178"
## [1] "Incomplete school districts: 206"
## [1] "CO"
## [1] "Unique school district codes: 209"
## [1] "Incomplete school districts: 33"
## [1] "CT"
## [1] "Unique school district codes: 50"
## [1] "Incomplete school districts: 50"
## [1] "FL"
## Time is not evenly distributed (possibly due to missing data).
## [1] "Unique school district codes: 131"
## [1] "Incomplete school districts: 36"
## [1] "IL"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 1120"
## [1] "Incomplete school districts: 265"
## [1] "IN"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 326"
## [1] "Incomplete school districts: 40"
## [1] "KS"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 363"
## [1] "Incomplete school districts: 61"
## [1] "ME"
## [1] "Unique school district codes: 140"
## [1] "Incomplete school districts: 72"
## [1] "MA"
## [1] "Unique school district codes: 93"
## [1] "Incomplete school districts: 13"
## [1] "MI"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 641"
## [1] "Incomplete school districts: 83"
## [1] "MN"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 449"
## [1] "Incomplete school districts: 133"
## [1] "MS"
## [1] "Unique school district codes: 171"
## [1] "Incomplete school districts: 28"
## [1] "MO"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 554"
## [1] "Incomplete school districts: 26"
## [1] "MT"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 387"
## [1] "Incomplete school districts: 92"
## [1] "NE"
## If the number of units is more than 300, we set "gridOff = TRUE".
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 713"
## [1] "Incomplete school districts: 476"
## [1] "NJ"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 561"
## [1] "Incomplete school districts: 43"
## [1] "NY"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 721"
## [1] "Incomplete school districts: 52"
## [1] "ND"
## [1] "Unique school district codes: 273"
## [1] "Incomplete school districts: 105"
## [1] "OH"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 796"
## [1] "Incomplete school districts: 137"
## [1] "OK"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 593"
## [1] "Incomplete school districts: 87"
## [1] "PA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 646"
## [1] "Incomplete school districts: 135"
## [1] "SC"
## [1] "Unique school district codes: 106"
## [1] "Incomplete school districts: 30"
## [1] "SD"
## [1] "Unique school district codes: 192"
## [1] "Incomplete school districts: 45"
## [1] "TX"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 1114"
## [1] "Incomplete school districts: 52"
## [1] "VT"
## If the number of units is more than 300, we set "gridOff = TRUE".
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 393"
## [1] "Incomplete school districts: 329"
## [1] "WV"
## [1] "Unique school district codes: 74"
## [1] "Incomplete school districts: 19"
## [1] "WI"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 444"
## [1] "Incomplete school districts: 9"
## [1] "DE"
## [1] "GA"
## [1] "Unique school district codes: 196"
## [1] "Incomplete school districts: 17"
## [1] "ID"
## [1] "Unique school district codes: 120"
## [1] "Incomplete school districts: 8"
## [1] "IA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 406"
## [1] "Incomplete school districts: 67"
## [1] "KY"
## [1] "Unique school district codes: 176"
## [1] "Incomplete school districts: 5"
## [1] "LA"
## [1] "Unique school district codes: 69"
## [1] "Incomplete school districts: 3"
## [1] "NV"
## [1] "NH"
## [1] "Unique school district codes: 171"
## [1] "Incomplete school districts: 11"
## [1] "NM"
## Time is not evenly distributed (possibly due to missing data).
## [1] "Unique school district codes: 96"
## [1] "Incomplete school districts: 1"
## [1] "OR"
## [1] "Unique school district codes: 266"
## [1] "Incomplete school districts: 43"
## [1] "RI"
## [1] "Unique school district codes: 8"
## [1] "Incomplete school districts: 4"
## [1] "TN"
## [1] "UT"
## [1] "Unique school district codes: 41"
## [1] "Incomplete school districts: 1"
## [1] "VA"
## Time is not evenly distributed (possibly due to missing data).
## Time is not evenly distributed (possibly due to missing data).
## [1] "Unique school district codes: 1"
## [1] "Incomplete school districts: 1"
## [1] "WA"
## If the number of units is more than 300, we set "gridOff = TRUE".
## [1] "Unique school district codes: 305"
## [1] "Incomplete school districts: 10"
## [1] "WY"
## Time is not evenly distributed (possibly due to missing data).
## [1] "Unique school district codes: 57"
## [1] "Incomplete school districts: 2"
temp <- sum_state(temp_short, 1)
## [1] "AL"
## [1] "Unique school district codes: 138"
## [1] "Incomplete school districts: 11"
temp <- sum_state(temp_short, 2)
## [1] "AZ"
## [1] "Unique school district codes: 250"
## [1] "Incomplete school districts: 29"
#sch_names %>%
# filter(FIPSid %in% test) %>%
# select(FIPSid, fips, Name, year) %>%
# arrange(year) %>%
# pivot_wider(id_cols = c(FIPSid, fips), names_from = "year", values_from = "Name") %>%
# arrange(fips) %>% print(n = 29)